# @Time : 2021/3/27 14:45
# @Author : chao

#代码参考自：https://blog.csdn.net/weixin_45314989/article/details/104390725?utm_medium=distribute.pc_relevant.none-task-blog-baidujs_title-0&spm=1001.2101.3001.4242

#采用word2vec对分词后的文件进行训练，将每个词语映射到词向量空间
import logging
import multiprocessing
import os
import sys
from collections import Counter

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
#word2vec获取词向量
from pandas import np
from sklearn.metrics import silhouette_score

#word2vec
#word2vec
def wordsCluster():
    """Train a skip-gram word2vec model on the pre-segmented corpus and
    save both the binary model and the plain-text word vectors."""
    script_name = os.path.basename(sys.argv[0])  # name of the running script
    log = logging.getLogger(script_name)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
    log.info("running %s" % ' '.join(sys.argv))

    # inp: segmented corpus; out_model: trained model; out_vector: text-format vectors
    inp = r"C:\Users\鲍超\Desktop\小论文——基于在线评论的物流客户满意度研究\代码\去除停用词并分词\去除停用词并分词结果\zong_fengci_tingyongci2.txt"
    out_model = r'C:\Users\鲍超\Desktop\小论文——基于在线评论的物流客户满意度研究\代码\word2vec\数据\corpusSegDone_zong2.model'
    out_vector = r'C:\Users\鲍超\Desktop\小论文——基于在线评论的物流客户满意度研究\代码\word2vec\数据\corpusSegDone_zong2.txt'

    # Skip-gram (sg=1), 100-dim vectors, window 3, min count 5, 10 epochs.
    # (size=/iter= are the pre-gensim-4 parameter names used by this project.)
    model = Word2Vec(LineSentence(inp), size=100, window=3, min_count=5,
                     workers=multiprocessing.cpu_count(), iter=10, sg=1)
    model.save(out_model)                                    # binary gensim model
    model.wv.save_word2vec_format(out_vector, binary=False)  # human-readable vectors
    print("word2vec成功！！")

#Convert the word2vec text-format vector file to csv
def changeTxtToCsv():
    """Convert the word2vec text-format vector file to CSV.

    The first line of the word2vec text file is a "<vocab_size> <dim>"
    header; it is blanked out in place, then every remaining line
    ("word v1 v2 ... v100") is written as one CSV row.
    """
    out_vector = r'C:\Users\鲍超\Desktop\小论文——基于在线评论的物流客户满意度研究\代码\word2vec\数据\corpusSegDone_zong2.txt'

    # Read the whole file, then blank out the header line.
    with open(out_vector, "r", encoding='utf-8') as f:
        lines = f.readlines()
    if lines:  # guard: an empty file would otherwise raise IndexError
        lines[0] = '\n'

    # Rewrite the vector file in place (context manager instead of manual close).
    with open(out_vector, "w", encoding='utf-8') as f:
        f.writelines(lines)

    import csv
    csv_path = r'C:\Users\鲍超\Desktop\小论文——基于在线评论的物流客户满意度研究\代码\word2vec\数据\corpusSegDone_zong2.csv'
    # NOTE: the csv keeps the platform default encoding, matching how
    # jiangwei() reads it back.  Fix: the reader handle was never closed
    # in the original; both files are now managed by `with`.
    with open(csv_path, 'w', newline='') as csvfile, \
            open(out_vector, encoding='utf-8') as data:
        writer = csv.writer(csvfile)
        next(data)  # skip the (now blank) header line
        for each_line in data:
            writer.writerow(each_line.split())

    print("转换为csv文件成功！")

#Reduce the 100-dim word vectors to 2-D with PCA
def jiangwei():
    """Reduce the 100-dim word vectors to 2-D with PCA and return the 2-D
    vectors of the high-frequency nouns.

    Returns:
        list: [mingci_jiangwei_list, mingci_list] — the list of 2-D numpy
        vectors and the matching noun strings, aligned by index.
    """
    import numpy as np
    from sklearn.decomposition import PCA

    vectors = []
    words = []
    with open(r'C:\Users\鲍超\Desktop\小论文——基于在线评论的物流客户满意度研究\代码\word2vec\数据\corpusSegDone_zong2.csv',
              'r') as fd:
        for line in fd:
            line = line.strip()
            if not line:
                # Skip blank lines: the original `if line == "": continue`
                # was dead code (readline() returns "" only at EOF), while a
                # truly blank row would have produced a ragged np.array.
                continue
            fields = line.split(",")
            words.append(fields[0])
            vectors.append(fields[1:])

    # Values come back from csv as strings; convert explicitly to float.
    X = np.array(vectors, dtype=float)  # shape (n_words, 100)
    pca = PCA(n_components=2)           # reduce to 2 dimensions
    newX = pca.fit_transform(X)         # fit + transform in one pass (was fitted twice)

    # Map every word to its 2-D vector (renamed: `dict` shadowed the builtin).
    word_vectors = {}
    for i in range(len(words)):
        word_vectors[words[i]] = newX[i]

    # Pull the 2-D vectors of the high-frequency nouns out of the full map.
    mingci_list = []
    with open(r'C:\Users\鲍超\Desktop\小论文——基于在线评论的物流客户满意度研究\代码\词性标注\名词提取\名词提取数据\高频名词（过滤后）.txt',
              'r', encoding='ANSI') as mf:  # 'ANSI' codec is Windows-only, matching the hard-coded paths
        for raw in mf.readlines():
            mingci_list.append(raw.strip('\n'))

    # NOTE(review): a noun absent from the word2vec vocabulary raises
    # KeyError here, exactly as in the original.
    mingci_jiangwei_list = []
    for noun in mingci_list:
        mingci_jiangwei_list.append(word_vectors[noun.strip('\n')])

    result = [mingci_jiangwei_list, mingci_list]
    print("降维成功！！")
    return result

#Cluster the noun vectors with k-means and report the groups
def k_means(mingci_jiangwei_list, mingci_list, num=4):
    """Cluster the 2-D noun vectors with k-means, print each cluster's
    members (headed by its first word), and show a scatter plot.

    Args:
        mingci_jiangwei_list: 2-D vectors, one per noun.
        mingci_list: noun strings, aligned with the vectors.
        num: number of clusters; defaults to 4, matching the original
            hard-coded value, so existing callers behave identically.
    """
    from sklearn.cluster import KMeans
    import numpy as np
    from matplotlib import pyplot as plt

    X = np.array(mingci_jiangwei_list)
    kmeans = KMeans(n_clusters=num, random_state=0).fit(X)

    print(str(num) + "个中心词的坐标：")
    print(kmeans.cluster_centers_)

    # Group nouns by assigned cluster label.  This replaces the original
    # seven fixed lists (list1..list7), which silently dropped points for
    # num > 7 and left lists 5-7 dead for the default num = 4.
    clusters = [[] for _ in range(num)]
    for idx, label in enumerate(kmeans.labels_):
        clusters[label].append(mingci_list[idx])

    for members in clusters:
        if not members:  # k-means clusters are normally non-empty; guard anyway
            continue
        print("与关键词" + members[0] + "相关的词有：", end='')
        print(members)

    # Scatter plot of all the 2-D points.
    f1 = [vec[0] for vec in mingci_jiangwei_list]
    f2 = [vec[1] for vec in mingci_jiangwei_list]
    plt.scatter(f1, f2, c='blue', s=6)
    plt.show()


#Elbow method
def sse(mingci_jiangwei_list):
    """Elbow method: for k = 1..9 fit k-means and plot the mean distortion
    (average distance of each sample to its nearest centre).  The "elbow"
    of the resulting curve suggests the best k.
    """
    from sklearn.cluster import KMeans
    from scipy.spatial.distance import cdist
    import numpy as np
    import matplotlib.pyplot as plt

    X = np.array(mingci_jiangwei_list)

    # Preview scatter of the data set.
    # FIX: the original scattered the first two *word vectors* against each
    # other (2 meaningless points); plot the two PCA coordinate columns of
    # every sample instead.
    x1 = X[:, 0]
    x2 = X[:, 1]

    plt.plot()
    plt.xlim([0, 10])
    plt.ylim([0, 10])
    plt.title('Dataset')
    plt.scatter(x1, x2)
    plt.show()

    # create new plot for the elbow curve
    plt.plot()

    # Mean distortion for each candidate k.
    distortions = []
    K = range(1, 10)
    for k in K:
        # single fit — the original fitted each model twice
        kmeanModel = KMeans(n_clusters=k).fit(X)
        distortions.append(
            sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])

    # Plot the elbow
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

#Silhouette coefficient
def cs(mingci_jiangwei_list):
    """Silhouette-coefficient analysis: run k-means for several candidate
    cluster counts, plot the labelled points for each k, then plot the
    silhouette score against the number of clusters (higher is better;
    range [-1, 1]).
    """
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    import numpy as np

    X = np.array(mingci_jiangwei_list)
    # FIX: the original used the first two *word vectors* as coordinates,
    # so x1[i]/x2[i] below raised IndexError for any data set with more
    # than two samples.  Use the PCA coordinate columns of all samples.
    x1 = X[:, 0]
    x2 = X[:, 1]

    plt.xlim([0, 10])
    plt.ylim([0, 10])
    plt.title('Instances')
    plt.scatter(x1, x2)

    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'b']
    markers = ['o', 's', 'D', 'v', '^', 'p', '*', '+']

    clusters = [2, 3, 4, 5, 8]
    subplot_counter = 1
    sc_scores = []

    for t in clusters:
        subplot_counter += 1
        plt.subplot(3, 2, subplot_counter)
        kmeans_model = KMeans(n_clusters=t).fit(X)

        # One point per sample, coloured/marked by its cluster label.
        for i, l in enumerate(kmeans_model.labels_):
            plt.plot(x1[i], x2[i], color=colors[l], marker=markers[l], ls='None')
            plt.xlim([0, 10])
            plt.ylim([0, 10])
        # BUG FIX: silhouette_score has no `markers` parameter; the keyword
        # is `metric` — the original raised TypeError on this line.
        sc_score = silhouette_score(X, kmeans_model.labels_, metric='euclidean')
        sc_scores.append(sc_score)
        plt.title('K=%s,silhouette coefficient=%0.03f' % (t, sc_score))
        plt.figure()

    plt.plot(clusters, sc_scores, '*-')
    plt.xlabel('Numbers of clusters')
    plt.ylabel('Silhouette Coefficient score')
    plt.show()

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets, metrics


def km_sse_cs(mingci_jiangwei_list):
    """Evaluate KMeans cluster counts with two criteria.

    1. Within-cluster SSE (elbow method): plot the sum of squared errors
       for each candidate cluster count; the point where the decline
       suddenly flattens (the "elbow") is taken as the best k.  After
       fitting, KMeans exposes the SSE via the ``inertia_`` attribute.
    2. Silhouette coefficient: combines cluster cohesion and separation.
       The mean silhouette lies in [-1, 1]; larger is better, and negative
       values suggest misassigned points.
    """
    # FIX: the file-level `from pandas import np` alias was removed in
    # pandas >= 1.0; import numpy directly so this function keeps working.
    import numpy as np

    data = np.array(mingci_jiangwei_list)

    # SSE for each candidate cluster count
    sse_list = []
    # silhouette score for each candidate cluster count
    silhouettes = []
    for i in range(2, 15):
        model = KMeans(n_clusters=i)
        model.fit(data)
        # inertia_ holds the within-cluster SSE of the fitted model
        sse_list.append(model.inertia_)
        silhouette = metrics.silhouette_score(data, model.labels_, metric='euclidean')
        silhouettes.append(silhouette)

    # SSE (elbow) curve
    plt.subplot(211)
    plt.title('KMeans 簇内误方差')
    plt.plot(range(2, 15), sse_list, marker='*')
    plt.xlabel('簇数量')
    plt.ylabel('簇内误方差(SSE)')
    # silhouette curve
    plt.subplot(212)
    plt.title('KMeans 轮廓系数')
    plt.plot(range(2, 15), silhouettes, marker='o')
    plt.xlabel('簇数量')
    plt.ylabel('轮廓系数')

    plt.tight_layout()
    plt.show()



if __name__ == '__main__':
    import matplotlib as mpl
    import numpy as np
    import pandas as pd

    # Full pipeline: train word2vec, dump the vectors to csv, reduce them
    # to 2-D with PCA, then evaluate and run k-means on the noun vectors.
    wordsCluster()
    changeTxtToCsv()

    jiangwei_result = jiangwei()
    mingci_jiangwei_list = jiangwei_result[0]
    mingci_list = jiangwei_result[1]

    # Elbow (SSE) and silhouette-coefficient curves to help choose k.
    # mpl.rcParams['font.sans-serif'] = ['KaiTi', 'SimHei', 'FangSong']
    km_sse_cs(mingci_jiangwei_list)

    # k-means clustering of the high-frequency nouns.
    k_means(mingci_jiangwei_list, mingci_list)




###################################################################
# 30个特征词的结果
# 与关键词电话
# 相关的词有：['电话\n', '客服\n', '地址\n', '网点\n']
# 与关键词速度
# 相关的词有：['速度\n', '物流\n', '地方\n', '发货\n', '信息\n', '快件\n', '时间\n', '派件\n', '卖家\n', '收件\n']
# 与关键词服务
# 相关的词有：['服务\n', '快递员\n', '态度\n', '服务态度\n', '客户\n', '素质']
# 与关键词公司
# 相关的词有：['公司\n', '快递公司\n', '钱\n', '价格\n']

##################################################################
#50个特征词的结果：
# k值为6时
# 与关键词速度
# 相关的词有：['速度\n', '卖家\n', '情况\n', '商家\n', '货物\n', '中国\n', '寄件\n']
# 与关键词服务
# 相关的词有：['服务\n', '快递员\n', '态度\n', '服务态度\n', '客户\n', '素质\n', '人员\n', '员工\n', '快递小哥\n']
# 与关键词物流
# 相关的词有：['物流\n', '地方\n', '信息\n', '快件\n', '时间\n', '派件\n', '收件\n', '消息\n', '收货\n']
# 与关键词电话
# 相关的词有：['电话\n', '客服\n', '地址\n', '网点\n', '取件\n', '手机\n', '站点\n', '短信\n', '收件人\n', '人工\n', '总部\n']
# 与关键词公司
# 相关的词有：['公司\n', '快递公司\n', '价格\n', '关门\n', '钱\n', '建议\n', '收费\n', '效率\n', '网购\n', '行业']
# 与关键词发货
# 相关的词有：['发货\n', '中心\n', '城市\n', '路程\n']